import dalex as dx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
# Load the raw hotel-bookings dataset from the working directory.
data = pd.read_csv(filepath_or_buffer='hotel_bookings.csv')
# Preview the first five rows to check that the file was parsed as expected.
data.head(n=5)
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | NaN | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 304.0 | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | No Deposit | 240.0 | NaN | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 2015-07-03 |
5 rows × 32 columns
# Feature groups; each group gets its own preprocessing step later on.
numeric_features = [
    'lead_time', 'arrival_date_year', 'adults', 'children', 'babies',
    'booking_changes', 'previous_cancellations', 'is_repeated_guest',
]
categorical_features = ['arrival_date_month', 'deposit_type', 'customer_type']

# Keep only the target plus the model features (numeric first, then categorical)
# and discard every row that has a missing value in any of them.
data = data[['is_canceled'] + numeric_features + categorical_features].dropna()

# X holds every column except the target; y is kept as a single-column DataFrame.
X = data.drop(columns='is_canceled')
y = data[['is_canceled']]

# Hold out 10% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)
# One-hot encode the categorical columns. handle_unknown='ignore' makes a category
# that is absent from the training split encode as all zeros at prediction time
# instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
# Standardise the numeric columns.
numeric_transformer = StandardScaler()
# Route each feature group to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', categorical_transformer, categorical_features),
        ('numeric', numeric_transformer, numeric_features),
    ]
)
# NOTE: the final step holds a classifier; the step name 'regressor' is kept
# unchanged so that any lookup of the step by name keeps working.
forest = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestClassifier(random_state=123)),
])
# y_train is built as a single-column DataFrame; flatten it to 1-D, because a
# column-vector target makes scikit-learn emit a DataConversionWarning on fit.
forest.fit(X_train, np.ravel(y_train))
# Evaluate on the held-out split using column 1 of predict_proba
# (the probability of the second class in the classifier's class order).
print(f'ROC score: {roc_auc_score(y_test, forest.predict_proba(X_test)[:, 1])}')
ROC score: 0.8403758492684325
# Wrap the fitted pipeline in a dalex explainer, using the training split as the
# data and target the explainer is built from.
exp_forest = dx.Explainer(
    model=forest,
    data=X_train,
    y=y_train,
    label='random_forest',
)
Preparation of a new explainer is initiated -> data : 107447 rows 11 cols -> target variable : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray. -> target variable : 107447 values -> model_class : sklearn.ensemble._forest.RandomForestClassifier (default) -> label : random_forest -> predict function : <function yhat_proba_default at 0x000002271220C550> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 0.0, mean = 0.371, max = 1.0 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.978, mean = -0.000466, max = 0.989 -> model_info : package sklearn A new explainer has been created!
# Partial dependence profiles for four of the numeric features.
# Observations from the plot:
# - as usual, a higher lead_time leads to a higher cancellation probability
# - booking changes lower the probability slightly once their number is > 0
# - previous cancellations make a cancellation more probable, as expected
# - the number of children does not contribute much to the probability
forest_mprofile = exp_forest.model_profile(
    type='partial',
    variables=['lead_time', 'booking_changes', 'children', 'previous_cancellations'],
)
forest_mprofile.plot()
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 4/4 [00:01<00:00, 2.10it/s]
# Accumulated local effects (ALE) profiles for the same four features.
# Observation from the plot: the ALE profiles look the same as the partial
# dependence profiles.
forest_mprofile = exp_forest.model_profile(
    type='ale',
    variables=['lead_time', 'booking_changes', 'children', 'previous_cancellations'],
)
forest_mprofile.plot()
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 4/4 [00:02<00:00, 1.91it/s] Calculating accumulated dependency: 100%|████████████████████████████████████████████████| 4/4 [00:00<00:00, 5.16it/s]
# Partial dependence profile of lead_time, split into groups by number of children.
# The number of children alone may not be decisive for the model, but it may define
# groups of observations that behave differently, so the lead_time curves are
# compared per group.
# Observations from the plot:
# - when lead_time is high enough (> 300 days), customers with 2 children tend to
#   cancel more often than those with 1 or 0
# - for smaller lead times (< 300 days) the behaviour of all groups looks similar
forest_mprofile = exp_forest.model_profile(
    type='partial',
    variables=['lead_time'],
    groups='children',
)
forest_mprofile.plot()
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.57it/s]
# Same analysis as for children, now grouped by the number of babies.
# Observations from the plot:
# - when lead_time is small enough (< 100 days), the cancellation probability is
#   higher for families that have a baby
# - beyond that, a family without a baby is even more likely to cancel its
#   reservation
# Possible explanation (author's hypothesis): families with babies plan their
# holidays more carefully than families without, but when the lead time is short,
# having a baby can disrupt their plans.
forest_mprofile = exp_forest.model_profile(
    type='partial',
    variables=['lead_time'],
    groups='babies',
)
forest_mprofile.plot()
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.29it/s]